YOLOv3 model
In this assignment, I applied a YOLOv3 model for object detection on images using Keras.
The tasks:
- Download and save the pre-trained model weights.
- Create the YOLOv3 model.
- Make a prediction on three images (e.g., images of people, animals, objects, etc.) and report the model’s output.
- Explain the purpose of non-max suppression.
- Repeat step 3 using different values for non-max suppression and report what we can observe.
- Download and save the pre-trained model weights
To perform object detection with YOLOv3 in Keras, the first step is to download the pre-trained model weights. These were trained with the DarkNet code base on the MSCOCO dataset. Download the weights and place them in the current working directory with the filename “yolov3.weights”. The pre-trained model weights are available at https://pjreddie.com/media/files/yolov3.weights .
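The file can also be fetched directly from Python; a minimal sketch using only the standard library (assuming the URL above is reachable):
# download the weights file into the current working directory
import urllib.request
urllib.request.urlretrieve('https://pjreddie.com/media/files/yolov3.weights', 'yolov3.weights')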
- Create the YOLOv3 model
We define a Keras model that has the right number and type of layers to match the downloaded model weights. The model architecture is called a “DarkNet” and was originally loosely based on the VGG-16 model.
# import the libs
import struct
import numpy as np
from keras.layers import Conv2D
from keras.layers import Input
from keras.layers import BatchNormalization
from keras.layers import LeakyReLU
from keras.layers import ZeroPadding2D
from keras.layers import UpSampling2D
from keras.layers import add, concatenate
from keras.models import Model
from numpy import expand_dims
from keras.models import load_model
from tensorflow.keras.utils import load_img
from tensorflow.keras.utils import img_to_array
from matplotlib import pyplot
from matplotlib.patches import Rectangle
import cv2
(1) Create the YOLOv3 model
# create a CNN block used by make_yolov3_model()
def _conv_block(inp, convs, skip=True):
    x = inp
    count = 0
    for conv in convs:
        if count == (len(convs) - 2) and skip:
            skip_connection = x
        count += 1
        if conv['stride'] > 1: x = ZeroPadding2D(((1,0),(1,0)))(x) # peculiar padding as darknet prefers left and top
        x = Conv2D(conv['filter'],
                   conv['kernel'],
                   strides=conv['stride'],
                   padding='valid' if conv['stride'] > 1 else 'same', # peculiar padding as darknet prefers left and top
                   name='conv_' + str(conv['layer_idx']),
                   use_bias=False if conv['bnorm'] else True)(x)
        if conv['bnorm']: x = BatchNormalization(epsilon=0.001, name='bnorm_' + str(conv['layer_idx']))(x)
        if conv['leaky']: x = LeakyReLU(alpha=0.1, name='leaky_' + str(conv['layer_idx']))(x)
    return add([skip_connection, x]) if skip else x
# create the YOLOv3 model
def make_yolov3_model():
    input_image = Input(shape=(None, None, 3))
    # Layer 0 => 4
    x = _conv_block(input_image, [{'filter': 32, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 0},
                                  {'filter': 64, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 1},
                                  {'filter': 32, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 2},
                                  {'filter': 64, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 3}])
    # Layer 5 => 8
    x = _conv_block(x, [{'filter': 128, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 5},
                        {'filter': 64, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 6},
                        {'filter': 128, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 7}])
    # Layer 9 => 11
    x = _conv_block(x, [{'filter': 64, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 9},
                        {'filter': 128, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 10}])
    # Layer 12 => 15
    x = _conv_block(x, [{'filter': 256, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 12},
                        {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 13},
                        {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 14}])
    # Layer 16 => 36
    for i in range(7):
        x = _conv_block(x, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 16+i*3},
                            {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 17+i*3}])
    skip_36 = x
    # Layer 37 => 40
    x = _conv_block(x, [{'filter': 512, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 37},
                        {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 38},
                        {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 39}])
    # Layer 41 => 61
    for i in range(7):
        x = _conv_block(x, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 41+i*3},
                            {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 42+i*3}])
    skip_61 = x
    # Layer 62 => 65
    x = _conv_block(x, [{'filter': 1024, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 62},
                        {'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 63},
                        {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 64}])
    # Layer 66 => 74
    for i in range(3):
        x = _conv_block(x, [{'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 66+i*3},
                            {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 67+i*3}])
    # Layer 75 => 79
    x = _conv_block(x, [{'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 75},
                        {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 76},
                        {'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 77},
                        {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 78},
                        {'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 79}], skip=False)
    # Layer 80 => 82
    yolo_82 = _conv_block(x, [{'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 80},
                              {'filter': 255, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 81}], skip=False)
    # Layer 83 => 86
    x = _conv_block(x, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 84}], skip=False)
    x = UpSampling2D(2)(x)
    x = concatenate([x, skip_61])
    # Layer 87 => 91
    x = _conv_block(x, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 87},
                        {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 88},
                        {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 89},
                        {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 90},
                        {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 91}], skip=False)
    # Layer 92 => 94
    yolo_94 = _conv_block(x, [{'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 92},
                              {'filter': 255, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 93}], skip=False)
    # Layer 95 => 98
    x = _conv_block(x, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 96}], skip=False)
    x = UpSampling2D(2)(x)
    x = concatenate([x, skip_36])
    # Layer 99 => 106
    yolo_106 = _conv_block(x, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 99},
                               {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 100},
                               {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 101},
                               {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 102},
                               {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 103},
                               {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 104},
                               {'filter': 255, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 105}], skip=False)
    model = Model(input_image, [yolo_82, yolo_94, yolo_106])
    return model
# define the model by calling make_yolov3_model()
model = make_yolov3_model()
(2) Load the model weights
The model weights are stored in the binary format used by DarkNet. Rather than trying to decode the file manually, we can use the WeightReader class defined below.
The WeightReader is instantiated with the path to our weights file (e.g. ‘yolov3.weights‘). It parses the file and loads the model weights into memory in a format that we can set into our Keras model.
class WeightReader:
    def __init__(self, weight_file):
        with open(weight_file, 'rb') as w_f:
            major, = struct.unpack('i', w_f.read(4))
            minor, = struct.unpack('i', w_f.read(4))
            revision, = struct.unpack('i', w_f.read(4))
            # the header length depends on the weights-file version
            if (major*10 + minor) >= 2 and major < 1000 and minor < 1000:
                w_f.read(8)
            else:
                w_f.read(4)
            transpose = (major > 1000) or (minor > 1000)
            binary = w_f.read()
        self.offset = 0
        self.all_weights = np.frombuffer(binary, dtype='float32')

    def read_bytes(self, size):
        self.offset = self.offset + size
        return self.all_weights[self.offset-size:self.offset]

    def load_weights(self, model):
        for i in range(106):
            try:
                conv_layer = model.get_layer('conv_' + str(i))
                print("loading weights of convolution #" + str(i))
                if i not in [81, 93, 105]:
                    norm_layer = model.get_layer('bnorm_' + str(i))
                    size = np.prod(norm_layer.get_weights()[0].shape)
                    beta = self.read_bytes(size) # bias
                    gamma = self.read_bytes(size) # scale
                    mean = self.read_bytes(size) # mean
                    var = self.read_bytes(size) # variance
                    norm_layer.set_weights([gamma, beta, mean, var])
                if len(conv_layer.get_weights()) > 1:
                    bias = self.read_bytes(np.prod(conv_layer.get_weights()[1].shape))
                    kernel = self.read_bytes(np.prod(conv_layer.get_weights()[0].shape))
                    kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape)))
                    kernel = kernel.transpose([2,3,1,0])
                    conv_layer.set_weights([kernel, bias])
                else:
                    kernel = self.read_bytes(np.prod(conv_layer.get_weights()[0].shape))
                    kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape)))
                    kernel = kernel.transpose([2,3,1,0])
                    conv_layer.set_weights([kernel])
            except ValueError:
                print("no convolution #" + str(i))

    def reset(self):
        self.offset = 0
# load the model weights
weight_reader = WeightReader('yolov3.weights')
# set the model weights into the model
weight_reader.load_weights(model)
loading weights of convolution #0
loading weights of convolution #1
loading weights of convolution #2
loading weights of convolution #3
no convolution #4
loading weights of convolution #5
...
loading weights of convolution #104
loading weights of convolution #105
The “no convolution #N” messages correspond to layers that carry no convolution weights (the shortcut, route, upsample and detection layers).
(3) Save this model to a Keras-compatible .h5 model file ready for later use.
# save the model to file
model.save('model.h5')
WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.
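In a later session, the saved model can be reloaded with load_model (imported above) instead of rebuilding the architecture and re-parsing the DarkNet weights:
# reload the saved Keras model from file
model = load_model('model.h5')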
- Make a prediction on three images
We download the images from the MSCOCO dataset: https://www.kaggle.com/datasets/awsaf49/coco-2017-dataset
# load and prepare an image
def load_image_pixels(filename, shape):
    # load the image to get its original size
    image = load_img(filename)
    width, height = image.size
    # load the image again with the required size
    image = load_img(filename, target_size=shape)
    # convert to a numpy array
    image = img_to_array(image)
    # scale pixel values to [0, 1]
    image = image.astype('float32')
    image /= 255.0
    # add a dimension so that we have one sample
    image = expand_dims(image, 0)
    return image, width, height
class BoundBox:
    def __init__(self, xmin, ymin, xmax, ymax, objness=None, classes=None):
        self.xmin = xmin
        self.ymin = ymin
        self.xmax = xmax
        self.ymax = ymax
        self.objness = objness
        self.classes = classes
        self.label = -1
        self.score = -1

    def get_label(self):
        if self.label == -1:
            self.label = np.argmax(self.classes)
        return self.label

    def get_score(self):
        if self.score == -1:
            self.score = self.classes[self.get_label()]
        return self.score

def _sigmoid(x):
    return 1. / (1. + np.exp(-x))
def decode_netout(netout, anchors, obj_thresh, net_h, net_w):
    grid_h, grid_w = netout.shape[:2]
    nb_box = 3
    netout = netout.reshape((grid_h, grid_w, nb_box, -1))
    nb_class = netout.shape[-1] - 5
    boxes = []
    netout[..., :2] = _sigmoid(netout[..., :2])
    netout[..., 4:] = _sigmoid(netout[..., 4:])
    netout[..., 5:] = netout[..., 4][..., np.newaxis] * netout[..., 5:]
    netout[..., 5:] *= netout[..., 5:] > obj_thresh
    for i in range(grid_h*grid_w):
        row = i // grid_w
        col = i % grid_w
        for b in range(nb_box):
            # 4th element is the objectness score
            objectness = netout[row][col][b][4]
            if objectness <= obj_thresh: continue
            # first 4 elements are x, y, w, and h
            x, y, w, h = netout[row][col][b][:4]
            x = (col + x) / grid_w # center position, unit: image width
            y = (row + y) / grid_h # center position, unit: image height
            w = anchors[2 * b + 0] * np.exp(w) / net_w # unit: image width
            h = anchors[2 * b + 1] * np.exp(h) / net_h # unit: image height
            # last elements are class probabilities
            classes = netout[row][col][b][5:]
            box = BoundBox(x-w/2, y-h/2, x+w/2, y+h/2, objectness, classes)
            boxes.append(box)
    return boxes
def correct_yolo_boxes(boxes, image_h, image_w, net_h, net_w):
    new_w, new_h = net_w, net_h
    for i in range(len(boxes)):
        x_offset, x_scale = (net_w - new_w)/2./net_w, float(new_w)/net_w
        y_offset, y_scale = (net_h - new_h)/2./net_h, float(new_h)/net_h
        boxes[i].xmin = int((boxes[i].xmin - x_offset) / x_scale * image_w)
        boxes[i].xmax = int((boxes[i].xmax - x_offset) / x_scale * image_w)
        boxes[i].ymin = int((boxes[i].ymin - y_offset) / y_scale * image_h)
        boxes[i].ymax = int((boxes[i].ymax - y_offset) / y_scale * image_h)
def _interval_overlap(interval_a, interval_b):
    x1, x2 = interval_a
    x3, x4 = interval_b
    if x3 < x1:
        if x4 < x1:
            return 0
        else:
            return min(x2,x4) - x1
    else:
        if x2 < x3:
            return 0
        else:
            return min(x2,x4) - x3

def bbox_iou(box1, box2):
    intersect_w = _interval_overlap([box1.xmin, box1.xmax], [box2.xmin, box2.xmax])
    intersect_h = _interval_overlap([box1.ymin, box1.ymax], [box2.ymin, box2.ymax])
    intersect = intersect_w * intersect_h
    w1, h1 = box1.xmax-box1.xmin, box1.ymax-box1.ymin
    w2, h2 = box2.xmax-box2.xmin, box2.ymax-box2.ymin
    union = w1*h1 + w2*h2 - intersect
    return float(intersect) / union
def do_nms(boxes, nms_thresh):
    if len(boxes) > 0:
        nb_class = len(boxes[0].classes)
    else:
        return
    for c in range(nb_class):
        sorted_indices = np.argsort([-box.classes[c] for box in boxes])
        for i in range(len(sorted_indices)):
            index_i = sorted_indices[i]
            if boxes[index_i].classes[c] == 0: continue
            for j in range(i+1, len(sorted_indices)):
                index_j = sorted_indices[j]
                # suppress the lower-scoring box when the overlap is too high
                if bbox_iou(boxes[index_i], boxes[index_j]) >= nms_thresh:
                    boxes[index_j].classes[c] = 0
# get all of the results above a threshold
def get_boxes(boxes, labels, thresh):
    v_boxes, v_labels, v_scores = list(), list(), list()
    # enumerate all boxes
    for box in boxes:
        # enumerate all possible labels
        for i in range(len(labels)):
            # check if the threshold for this label is high enough
            if box.classes[i] > thresh:
                v_boxes.append(box)
                v_labels.append(labels[i])
                v_scores.append(box.classes[i]*100)
            # don't break, many labels may trigger for one box
    return v_boxes, v_labels, v_scores
# draw all results
def draw_boxes(filename, v_boxes, v_labels, v_scores):
    # load the image
    data = pyplot.imread(filename)
    # plot the image
    pyplot.imshow(data)
    # get the context for drawing boxes
    ax = pyplot.gca()
    # plot each box
    for i in range(len(v_boxes)):
        box = v_boxes[i]
        # get coordinates
        y1, x1, y2, x2 = box.ymin, box.xmin, box.ymax, box.xmax
        # calculate width and height of the box
        width, height = x2 - x1, y2 - y1
        # create the shape
        rect = Rectangle((x1, y1), width, height, fill=False, color='white')
        # draw the box
        ax.add_patch(rect)
        # draw text and score in top left corner
        label = "%s (%.3f)" % (v_labels[i], v_scores[i])
        pyplot.text(x1, y1, label, color='white')
    # show the plot
    pyplot.show()
(1) The prediction on images of people
We predict on an image of people with a non-max suppression threshold of 0.5.
def prediction_yolov3(img_file, nms_thresh):
    # define the expected input shape for the model
    input_w, input_h = 416, 416
    # define our new photo
    photo_filename = img_file
    # load and prepare the image
    image, image_w, image_h = load_image_pixels(photo_filename, (input_w, input_h))
    # make prediction
    yhat = model.predict(image)
    # summarize the shape of the list of arrays
    print([a.shape for a in yhat])
    # define the anchors
    anchors = [[116,90, 156,198, 373,326], [30,61, 62,45, 59,119], [10,13, 16,30, 33,23]]
    # define the probability threshold for detected objects
    class_threshold = 0.6
    boxes = list()
    for i in range(len(yhat)):
        # decode the output of the network
        boxes += decode_netout(yhat[i][0], anchors[i], class_threshold, input_h, input_w)
    # correct the sizes of the bounding boxes for the shape of the image
    correct_yolo_boxes(boxes, image_h, image_w, input_h, input_w)
    # suppress non-maximal boxes
    do_nms(boxes, nms_thresh)
    # define the labels
    labels = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck",
              "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
              "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
              "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
              "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
              "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana",
              "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
              "chair", "sofa", "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop", "mouse",
              "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator",
              "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"]
    # get the details of the detected objects
    v_boxes, v_labels, v_scores = get_boxes(boxes, labels, class_threshold)
    # summarize what we found
    for i in range(len(v_boxes)):
        print(v_labels[i], v_scores[i])
    # draw what we found
    draw_boxes(photo_filename, v_boxes, v_labels, v_scores)
    return v_boxes, v_labels, v_scores
# to predict the image of people
print("non-maximal suppression is 0.5")
v_boxes_people_5, labels_people_5, scores_people_5 = prediction_yolov3("people.jpg", 0.5)
non-maximal suppression is 0.5
1/1 [==============================] - 1s 1s/step
[(1, 13, 13, 255), (1, 26, 26, 255), (1, 52, 52, 255)]
person 99.51162338256836
person 88.29629421234131
person 99.6243953704834
person 98.4839677810669
person 97.77601361274719
handbag 60.902976989746094
person 93.30617785453796
tie 95.68508267402649
tie 64.77476954460144
tie 94.2353904247284
tie 86.83189749717712
(2) The prediction on images of animals
We predict on an image of animals with a non-max suppression threshold of 0.5.
# to predict the image of animals
print("non-maximal suppression is 0.5")
v_boxes_animals_5, labels_animals_5, scores_animals_5 = prediction_yolov3("animals.jpg",0.5)
non-maximal suppression is 0.5
1/1 [==============================] - 0s 362ms/step
[(1, 13, 13, 255), (1, 26, 26, 255), (1, 52, 52, 255)]
cow 98.87601137161255
cow 98.30766916275024
cow 83.11805725097656
cow 96.12895846366882
cow 97.69335389137268
(3) The prediction on images of objects
We predict on an image of objects with a non-max suppression threshold of 0.5 here; thresholds of 0.3 and 0.8 are applied in the last step.
# to predict the image of objects
print("non-maximal suppression is 0.5")
v_boxes_objects_5, labels_objects_5, scores_objects_5 = prediction_yolov3("objects.jpg",0.5)
non-maximal suppression is 0.5
1/1 [==============================] - 0s 321ms/step
[(1, 13, 13, 255), (1, 26, 26, 255), (1, 52, 52, 255)]
aeroplane 99.57087635993958
- Explain the purpose of non-max suppression.
The purpose of non-max suppression in YOLOv3 is to remove redundant or overlapping bounding boxes that have been detected for the same object.
Non-max suppression works by first sorting the detected bounding boxes based on their confidence scores. It then selects the bounding box with the highest confidence score and removes all other bounding boxes that have a high overlap (i.e., intersection over union or IoU) with this box. This process is repeated until there are no more boxes left to be processed.
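As an illustration, here is a minimal standalone sketch of that greedy procedure (a hypothetical helper, not part of the script above; do_nms() applies the same idea per class, zeroing suppressed scores instead of deleting boxes):
# greedy NMS sketch: boxes is a list of (xmin, ymin, xmax, ymax) tuples,
# scores the matching confidence scores, iou_thresh the allowed overlap
def greedy_nms(boxes, scores, iou_thresh):
    order = list(np.argsort(scores)[::-1]) # highest confidence first
    keep = []
    while order:
        best = order.pop(0)
        keep.append(best)
        # discard every remaining box that overlaps the chosen box too much
        order = [i for i in order
                 if bbox_iou(BoundBox(*boxes[best]), BoundBox(*boxes[i])) < iou_thresh]
    return keep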
- Repeat step 3 using different values for non-max suppression and report what we can observe.
# to predict the image of people with non-max suppression thresholds of 0.3 and 0.8
print("non-maximal suppression is 0.3")
v_boxes_people_3, labels_people_3, scores_people_3 = prediction_yolov3("people.jpg", 0.3)
print("non-maximal suppression is 0.8")
v_boxes_people_8, labels_people_8, scores_people_8 = prediction_yolov3("people.jpg", 0.8)
non-maximal suppression is 0.3
1/1 [==============================] - 0s 323ms/step
[(1, 13, 13, 255), (1, 26, 26, 255), (1, 52, 52, 255)]
person 99.51162338256836
person 88.29629421234131
person 99.6243953704834
person 98.4839677810669
person 97.77601361274719
handbag 60.902976989746094
person 93.30617785453796
tie 95.68508267402649
tie 64.77476954460144
tie 94.2353904247284
tie 86.83189749717712
non-maximal suppression is 0.8
1/1 [==============================] - 0s 313ms/step
[(1, 13, 13, 255), (1, 26, 26, 255), (1, 52, 52, 255)]
person 71.15631699562073
person 65.54453372955322
person 99.51162338256836
person 88.29629421234131
person 99.6243953704834
person 98.4839677810669
person 97.77601361274719
handbag 60.902976989746094
person 62.80094385147095
person 90.26818871498108
person 93.30617785453796
person 87.87543177604675
person 73.77721071243286
person 97.6821780204773
tie 95.68508267402649
tie 89.40243721008301
tie 64.77476954460144
tie 94.2353904247284
tie 86.83189749717712
# to predict the image of animals with non-max suppression thresholds of 0.3 and 0.8
print("non-maximal suppression is 0.3")
v_boxes_animals_3, labels_animals_3, scores_animals_3 = prediction_yolov3("animals.jpg",0.3)
print("non-maximal suppression is 0.8")
v_boxes_animals_8, labels_animals_8, scores_animals_8 = prediction_yolov3("animals.jpg",0.8)
non-maximal suppression is 0.3
1/1 [==============================] - 0s 444ms/step
[(1, 13, 13, 255), (1, 26, 26, 255), (1, 52, 52, 255)]
cow 98.87601137161255
cow 98.30766916275024
cow 83.11805725097656
cow 96.12895846366882
cow 97.69335389137268
non-maximal suppression is 0.8
1/1 [==============================] - 0s 306ms/step
[(1, 13, 13, 255), (1, 26, 26, 255), (1, 52, 52, 255)]
cow 89.96080160140991
cow 62.95619606971741
cow 98.87601137161255
cow 98.30766916275024
cow 83.11805725097656
cow 96.12895846366882
cow 97.69335389137268
# to predict the image of objects with non-max suppression thresholds of 0.3 and 0.8
print("non-maximal suppression is 0.3")
v_boxes_objects_3, labels_objects_3, scores_objects_3 = prediction_yolov3("objects.jpg",0.3)
print("non-maximal suppression is 0.8")
v_boxes_objects_8, labels_objects_8, scores_objects_8 = prediction_yolov3("objects.jpg",0.8)
non-maximal suppression is 0.3
1/1 [==============================] - 0s 371ms/step
[(1, 13, 13, 255), (1, 26, 26, 255), (1, 52, 52, 255)]
aeroplane 99.57087635993958
non-maximal suppression is 0.8
1/1 [==============================] - 0s 286ms/step
[(1, 13, 13, 255), (1, 26, 26, 255), (1, 52, 52, 255)]
aeroplane 98.01173806190491
aeroplane 99.57087635993958
aeroplane 78.6253809928894
# plot the predictions on the images
# Create a 3x3 grid of subplots
fig, axes = pyplot.subplots(3, 3, figsize=(15, 15))
axes = axes.flatten()
# Load 9 sample images
images_path = ["./people.jpg", "./people.jpg", "./people.jpg",
               "./animals.jpg", "./animals.jpg", "./animals.jpg",
               "./objects.jpg", "./objects.jpg", "./objects.jpg"]
titles = ["people,nms=0.3", "people,nms=0.5", "people,nms=0.8",
          "animals,nms=0.3", "animals,nms=0.5", "animals,nms=0.8",
          "objects,nms=0.3", "objects,nms=0.5", "objects,nms=0.8"]
detected_info = [[v_boxes_people_3, labels_people_3, scores_people_3], [v_boxes_people_5, labels_people_5, scores_people_5], [v_boxes_people_8, labels_people_8, scores_people_8],
                 [v_boxes_animals_3, labels_animals_3, scores_animals_3], [v_boxes_animals_5, labels_animals_5, scores_animals_5], [v_boxes_animals_8, labels_animals_8, scores_animals_8],
                 [v_boxes_objects_3, labels_objects_3, scores_objects_3], [v_boxes_objects_5, labels_objects_5, scores_objects_5], [v_boxes_objects_8, labels_objects_8, scores_objects_8]]
images = [pyplot.imread(path) for path in images_path]
# Loop through the subplots and plot each image
for j, ax in enumerate(axes):
    ax.imshow(images[j])
    ax.set_title(titles[j])
    v_boxes, v_labels, v_scores = detected_info[j]
    for i in range(len(v_boxes)):
        box = v_boxes[i]
        # get coordinates
        y1, x1, y2, x2 = box.ymin, box.xmin, box.ymax, box.xmax
        # calculate width and height of the box
        width, height = x2 - x1, y2 - y1
        # create the shape
        rect = Rectangle((x1, y1), width, height, fill=False, color='white')
        # draw the box
        ax.add_patch(rect)
        # draw text and score in top left corner
        label = "%s (%.3f)" % (v_labels[i], v_scores[i])
        ax.text(x1, y1, label, color='white')
    ax.axis('off')
# Display the plot
pyplot.tight_layout()
pyplot.show()
The pictures above show that the results with non-max suppression at 0.3 are similar to those at 0.5, but at 0.8 there are noticeably more bounding boxes.
This is because do_nms() only suppresses a box when its IoU with a higher-scoring box reaches the threshold. With a high threshold such as 0.8, only near-identical boxes are removed, so many overlapping detections of the same object survive; with 0.3 or 0.5, most of these redundant boxes are suppressed.
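A small self-contained illustration (with two hypothetical boxes) of why a higher threshold keeps more boxes: the pair below overlaps with IoU of about 0.67, so the weaker box is suppressed at nms_thresh=0.5 but survives at 0.8.
# two hypothetical, heavily overlapping boxes with one class score each
for t in (0.5, 0.8):
    boxes = [BoundBox(0, 0, 100, 100, classes=np.array([0.9])),
             BoundBox(20, 0, 120, 100, classes=np.array([0.7]))]
    print('IoU:', bbox_iou(boxes[0], boxes[1])) # ~0.67
    do_nms(boxes, t)
    kept = [b for b in boxes if b.classes[0] > 0]
    print('nms_thresh=%.1f keeps %d box(es)' % (t, len(kept)))
# nms_thresh=0.5 keeps 1 box(es), nms_thresh=0.8 keeps 2 box(es)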